PFAS

Author

Casas Pontanillo, Oliver Arturo

Ejemplo PFAS

Primero importamos las librerías necesarias.

import pandas as pd
import numpy as np
import plotly.express as px
# Load the PFAS table; the path is relative to the notebook's working directory.
pfas_data = pd.read_csv("../data/pfas_data.csv")
# Preview the first rows to check that the file was parsed as expected.
pfas_data.head()

# Print a summary of the frame: number of rows and columns, dtypes and memory usage.
pfas_data.info()

# Fraction of rows whose SMILES string is missing (0.0 means none are missing).
pfas_data["RDKIT_SMILES"].isna().mean()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6071 entries, 0 to 6070
Columns: 2091 entries, RDKIT_SMILES to PubchemFP880
dtypes: float64(1176), int64(914), object(1)
memory usage: 96.9+ MB
np.float64(0.0)
# Pull the SMILES column out as a plain Python list, in row order.
molecules = pfas_data["RDKIT_SMILES"].tolist()
# Show the first five SMILES strings.
molecules[:5]
['FC(F)Cl',
 'FC(F)=C(F)F',
 'FC(F)(Cl)C(F)(Cl)Cl',
 'C=C(F)F',
 'OC(C(F)(F)F)C(F)(F)F']
!pip install rdkit
Requirement already satisfied: rdkit in /opt/venv/lib/python3.13/site-packages (2025.3.5)
Requirement already satisfied: numpy in /opt/venv/lib/python3.13/site-packages (from rdkit) (2.3.2)
Requirement already satisfied: Pillow in /opt/venv/lib/python3.13/site-packages (from rdkit) (11.3.0)
from rdkit import Chem
from rdkit.Chem import AllChem
# Parse every SMILES string into an RDKit molecule, keeping the row order.
mols = [Chem.MolFromSmiles(smiles) for smiles in molecules]

# Chem.MolFromSmiles returns None (it does not raise) when a SMILES cannot be
# parsed. Fail here with a clear message instead of letting a None reach the
# fingerprint step. Entries are not dropped silently because later cells rely
# on the molecules staying aligned row-by-row with pfas_data.
invalid_smiles = [smiles for smiles, mol in zip(molecules, mols) if mol is None]
if invalid_smiles:
    raise ValueError(
        f"RDKit could not parse {len(invalid_smiles)} SMILES string(s); "
        f"first examples: {invalid_smiles[:5]}"
    )

# Show the first five parsed molecules.
mols[:5]
[<rdkit.Chem.rdchem.Mol at 0xffff701bdaf0>,
 <rdkit.Chem.rdchem.Mol at 0xffff701bdc40>,
 <rdkit.Chem.rdchem.Mol at 0xffff701bdcb0>,
 <rdkit.Chem.rdchem.Mol at 0xffff701bdd20>,
 <rdkit.Chem.rdchem.Mol at 0xffff701bdd90>]
# Compute the MACCS-keys fingerprint (an RDKit bit vector) of each parsed molecule.
fps = [AllChem.GetMACCSKeysFingerprint(mol) for mol in mols]
# Show the first five fingerprints.
fps[:5]
[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff703312a0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff70331620>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff70331690>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff70331700>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff70331770>]
# Stack the bit vectors into a 2-D NumPy array: one row per molecule and one
# column per fingerprint bit.
fps_array = np.array(fps)
fps_array
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], shape=(6071, 167))
from sklearn.manifold import TSNE
# Project the fingerprint matrix onto two dimensions with t-SNE; the seed is
# fixed through random_state.
tsne = TSNE(n_components=2, perplexity=50, random_state=42)
fps_tsne = tsne.fit_transform(fps_array)

# Wrap the embedding in a DataFrame indexed by SMILES so that each point can
# be matched to its molecule.
tsne_df = pd.DataFrame(
    fps_tsne,
    index=pfas_data["RDKIT_SMILES"],
    columns=["Component_1", "Component_2"],
)
tsne_df.head()
Component_1 Component_2
RDKIT_SMILES
FC(F)Cl -56.922298 60.720055
FC(F)=C(F)F -69.412216 26.313509
FC(F)(Cl)C(F)(Cl)Cl -32.573349 29.395290
C=C(F)F -69.444672 26.623600
OC(C(F)(F)F)C(F)(F)F -22.384430 -27.337769
# Load the class labels, using the SMILES column as the index.
pfas_classes = pd.read_csv("../data/pfas_classes.csv", index_col = "RDKIT_SMILES")
# Inner merge on "RDKIT_SMILES" (the index name of both frames): only molecules
# present in both the embedding and the class table are kept.
tsne_df_joined = pd.merge(tsne_df, pfas_classes, on = "RDKIT_SMILES", how = "inner")
tsne_df_joined.head()
Component_1 Component_2 First_Class Second_Class
RDKIT_SMILES
FC(F)Cl -56.922298 60.720055 PFAS derivatives PFAS halogen derivatives
FC(F)=C(F)F -69.412216 26.313509 PFAS derivatives With fluorinated C=C or C=O carbon
FC(F)(Cl)C(F)(Cl)Cl -32.573349 29.395290 PFAS derivatives PFAS halogen derivatives
C=C(F)F -69.444672 26.623600 PFAS derivatives With fluorinated C=C or C=O carbon
OC(C(F)(F)F)C(F)(F)F -22.384430 -27.337769 Other aliphatics Others
# Hover tooltip contents: both coordinates with two decimals, plus the class.
hover_columns_2d = {
    "Component_1": ":.2f",
    "Component_2": ":.2f",
    "First_Class": True,
}

# Interactive 2-D scatter: one point per molecule, coloured by first-level
# class, with the SMILES string (the frame's index) as the tooltip title.
fig = px.scatter(
    tsne_df_joined,
    x="Component_1",
    y="Component_2",
    color="First_Class",
    hover_name=tsne_df_joined.index,
    hover_data=hover_columns_2d,
)

# Title, axis labels and a fixed figure size.
layout_2d = {
    "title": "Moléculas de PFAS mediante t-SNE",
    "xaxis_title": "Componente t-SNE 1",
    "yaxis_title": "Componente t-SNE 2",
    "height": 600,
    "width": 900,
}
fig.update_layout(**layout_2d)
fig.show();

Ahora vamos a repetir el análisis en $\mathbb{R}^3$, es decir, con una proyección t-SNE de tres componentes.

# Repeat the t-SNE projection with three output dimensions, using the same
# perplexity and seed as the 2-D run. This overwrites tsne, fps_tsne, tsne_df
# and tsne_df_joined from the 2-D analysis.
tsne = TSNE(n_components=3, perplexity=50, random_state=42)
fps_tsne = tsne.fit_transform(fps_array)

# Index the 3-D embedding by SMILES, as in the 2-D case.
tsne_df = pd.DataFrame(
    fps_tsne,
    index=pfas_data["RDKIT_SMILES"],
    columns=["Component_1", "Component_2", "Component_3"],
)

# Attach the class labels; the inner merge keeps only molecules found in both frames.
tsne_df_joined = pd.merge(
    tsne_df,
    pfas_classes,
    how="inner",
    on="RDKIT_SMILES",
)
tsne_df_joined.head()
Component_1 Component_2 Component_3 First_Class Second_Class
RDKIT_SMILES
FC(F)Cl -15.876364 24.557077 7.744306 PFAS derivatives PFAS halogen derivatives
FC(F)=C(F)F -24.740488 3.906794 -2.053560 PFAS derivatives With fluorinated C=C or C=O carbon
FC(F)(Cl)C(F)(Cl)Cl -9.861135 1.608409 8.375186 PFAS derivatives PFAS halogen derivatives
C=C(F)F -24.910532 3.875210 -1.773332 PFAS derivatives With fluorinated C=C or C=O carbon
OC(C(F)(F)F)C(F)(F)F -8.248621 -9.961960 -0.368669 Other aliphatics Others
# Hover tooltip contents: the three coordinates with two decimals, plus the class.
hover_columns_3d = {
    "Component_1": ":.2f",
    "Component_2": ":.2f",
    "Component_3": ":.2f",
    "First_Class": True,
}

# Interactive 3-D scatter: one point per molecule, coloured by first-level
# class, with the SMILES string (the frame's index) as the tooltip title.
fig = px.scatter_3d(
    tsne_df_joined,
    x="Component_1",
    y="Component_2",
    z="Component_3",
    color="First_Class",
    hover_name=tsne_df_joined.index,
    hover_data=hover_columns_3d,
)

# In 3-D figures the axis titles are set on the scene, not on the layout itself.
scene_axes = {
    "xaxis_title": "Componente t-SNE 1",
    "yaxis_title": "Componente t-SNE 2",
    "zaxis_title": "Componente t-SNE 3",
}
fig.update_layout(scene=scene_axes)
fig.show();